Ограничение на $P_z$
from starter import *
Baseline
# Baseline: keep the rows with 0 < Pz < 150 (both bounds exclusive) and
# evaluate the unmodified feature columns.
threshold = [0, 150]
pz_window = (data['Pz'] > threshold[0]) & (data['Pz'] < threshold[1])
X = data.loc[pz_window].drop(columns=['type'])
y = data.loc[pz_window, 'type']
# Bare expression: count of class-1 rows, only visible in an interactive session.
y[y == 1].size

# One scatter panel per pair of columns (10 pairs on a 2 x 5 grid):
# class 1 first, then class 0 with alpha=0.05.
plt.figure(figsize=(55, 21))
class_one = X[y == 1]
class_zero = X[y == 0]
for num, (x_name, y_name) in enumerate(combinations(['Px', 'Py', 'X', 'Y', 'Pz'], 2), 1):
    plt.subplot(2, 5, num)
    plt.scatter(class_one[x_name], class_one[y_name])
    plt.scatter(class_zero[x_name], class_zero[y_name], alpha=0.05)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
plt.show()

# One 70/30 split per seed in `states`; get_metrics receives the per-seed lists.
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(X, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
Первая группа
# First group: rows with 50 < Pz < 150 (both bounds exclusive), raw features only.
threshold = [50, 150]
pz_window = (data['Pz'] > threshold[0]) & (data['Pz'] < threshold[1])
X = data.loc[pz_window].drop(columns=['type'])
y = data.loc[pz_window, 'type']
# Bare expression: count of class-1 rows, only visible in an interactive session.
y[y == 1].size

# Pairwise scatter panels on a 2 x 5 grid: class 1 first, class 0 with alpha=0.05.
plt.figure(figsize=(55, 21))
class_one = X[y == 1]
class_zero = X[y == 0]
for num, (x_name, y_name) in enumerate(combinations(['Px', 'Py', 'X', 'Y', 'Pz'], 2), 1):
    plt.subplot(2, 5, num)
    plt.scatter(class_one[x_name], class_one[y_name])
    plt.scatter(class_zero[x_name], class_zero[y_name], alpha=0.05)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
plt.show()

# One 70/30 split per seed in `states`, collected into four parallel lists.
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(X, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
# Sum-of-squares features, one per pair of coordinates.
# Every pair gets its own column name: reusing a name (as with a second
# 'centr_dist2') would silently overwrite the earlier feature.
X['centr_dist1'] = X['Px'] ** 2 + X['Py'] ** 2
X['centr_dist2'] = X['Px'] ** 2 + X['Y'] ** 2
X['centr_dist3'] = X['Py'] ** 2 + X['X'] ** 2
X['centr_dist4'] = X['X'] ** 2 + X['Y'] ** 2

# Re-evaluate with the extra columns: one 70/30 split per seed in `states`.
X_train = []
X_test = []
y_train = []
y_test = []
for state in states:
    split = train_test_split(X, y, test_size=0.3, random_state=state)
    X_train.append(split[0])
    X_test.append(split[1])
    y_train.append(split[2])
    y_test.append(split[3])
get_metrics(X_train, X_test, y_train, y_test)
# Px vs X: class 1 first, class 0 with alpha=0.05, plus the straight line
# fitted through the class-0 points.
class_one = X[y == 1]
class_zero = X[y == 0]
plt.figure(figsize=(10, 10))
plt.scatter(class_one['Px'], class_one['X'])
plt.scatter(class_zero['Px'], class_zero['X'], alpha=0.05)
# NOTE(review): the fit uses every class-0 row of X, i.e. it runs before the
# train/test split below, so rows that land in the test sets also shape k1, b1.
k1, b1 = np.polyfit(class_zero['Px'], class_zero['X'], deg=1)
x_line = np.linspace(-2, 2, 2)
plt.plot(x_line, k1 * x_line + b1, 'black')
plt.show()

# Perpendicular distance from each (Px, X) point to the line X = k1 * Px + b1.
offset_from_line = X['X'] - (k1 * X['Px'] + b1)
X['line_dist1'] = np.abs(offset_from_line / (k1 ** 2 + 1) ** 0.5)

# One 70/30 split per seed in `states`, now including 'line_dist1'.
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(X, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
# Py vs Y: class 1 first, class 0 with alpha=0.05, plus the straight line
# fitted through the class-0 points.
class_one = X[y == 1]
class_zero = X[y == 0]
plt.figure(figsize=(10, 10))
plt.scatter(class_one['Py'], class_one['Y'])
plt.scatter(class_zero['Py'], class_zero['Y'], alpha=0.05)
# NOTE(review): fitted on all class-0 rows before the split below, so rows
# that land in the test sets also influence k2 and b2.
k2, b2 = np.polyfit(class_zero['Py'], class_zero['Y'], deg=1)
x_line = np.linspace(-2, 2, 2)
plt.plot(x_line, k2 * x_line + b2, 'black')
plt.show()

# Perpendicular distance from each (Py, Y) point to the line Y = k2 * Py + b2.
offset_from_line = X['Y'] - (k2 * X['Py'] + b2)
X['line_dist2'] = np.abs(offset_from_line / (k2 ** 2 + 1) ** 0.5)

# One 70/30 split per seed in `states`, now including 'line_dist2'.
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(X, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
SMOTE
# Same feature set as the preceding evaluation, but each training split is
# resampled with SMOTE before being handed to get_metrics; the test split is
# appended unchanged.
X_train = []
X_test = []
y_train = []
y_test = []
for state in states:
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=state)
    # Named `smote` rather than `os` so the standard-library module name is
    # not shadowed at module level.
    smote = SMOTE(random_state=42, n_jobs=-1)
    columns = Xtrain.columns
    os_data_X, os_data_y = smote.fit_resample(Xtrain, ytrain)
    # Re-wrap so the resampled features carry the training column labels.
    os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
    X_train.append(os_data_X)
    X_test.append(Xtest)
    y_train.append(os_data_y)
    y_test.append(ytest)
get_metrics(X_train, X_test, y_train, y_test)
Вторая группа
# Second group: rows with 10 < Pz < 50 (both bounds exclusive), raw features only.
threshold = [10, 50]
pz_window = (data['Pz'] > threshold[0]) & (data['Pz'] < threshold[1])
X = data.loc[pz_window].drop(columns=['type'])
y = data.loc[pz_window, 'type']
# Bare expression: count of class-1 rows, only visible in an interactive session.
y[y == 1].size

# Pairwise scatter panels on a 2 x 5 grid: class 1 first, class 0 with alpha=0.05.
plt.figure(figsize=(55, 21))
class_one = X[y == 1]
class_zero = X[y == 0]
for num, (x_name, y_name) in enumerate(combinations(['Px', 'Py', 'X', 'Y', 'Pz'], 2), 1):
    plt.subplot(2, 5, num)
    plt.scatter(class_one[x_name], class_one[y_name])
    plt.scatter(class_zero[x_name], class_zero[y_name], alpha=0.05)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
plt.show()

# One 70/30 split per seed in `states`, collected into four parallel lists.
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(X, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
# Px vs Py: class 1 first, class 0 with alpha=0.05, plus the straight line
# fitted through the class-1 points (this group fits on class 1, not class 0).
class_one = X[y == 1]
class_zero = X[y == 0]
plt.figure(figsize=(10, 10))
plt.scatter(class_one['Px'], class_one['Py'])
plt.scatter(class_zero['Px'], class_zero['Py'], alpha=0.05)
# NOTE(review): fitted on all class-1 rows before the split below, so rows
# that land in the test sets also influence k1 and b1.
k1, b1 = np.polyfit(class_one['Px'], class_one['Py'], deg=1)
x_line = np.linspace(-2, 2, 2)
plt.plot(x_line, k1 * x_line + b1, 'black')
plt.show()

# Sum of squares of Px and Py, and the perpendicular distance from each
# (Px, Py) point to the line Py = k1 * Px + b1.
X['centr_dist1'] = X['Px'] ** 2 + X['Py'] ** 2
offset_from_line = X['Py'] - (k1 * X['Px'] + b1)
X['line_dist1'] = np.abs(offset_from_line / (k1 ** 2 + 1) ** 0.5)

# One 70/30 split per seed in `states`, now including the two new columns.
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(X, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
# Px vs Y: class 1 first, class 0 with alpha=0.05, plus the straight line
# fitted through the class-1 points. Within this block k2 and b2 are used
# only to draw that line.
class_one = X[y == 1]
class_zero = X[y == 0]
plt.figure(figsize=(10, 10))
plt.scatter(class_one['Px'], class_one['Y'])
plt.scatter(class_zero['Px'], class_zero['Y'], alpha=0.05)
k2, b2 = np.polyfit(class_one['Px'], class_one['Y'], deg=1)
x_line = np.linspace(-2, 2, 2)
plt.plot(x_line, k2 * x_line + b2, 'black')
plt.ylim(-500, 500)
plt.show()

# Absolute value of Px as a new column.
X['axis_dist_Px'] = np.abs(X['Px'])

# One 70/30 split per seed in `states`; the raw 'Px' column is left out of
# the model inputs for this evaluation.
features_without_px = X.drop(columns=['Px'])
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(features_without_px, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
SMOTE
# SMOTE variant for the second group: each training split is resampled
# before evaluation; the test split is appended unchanged.
# NOTE(review): this run splits the full X, so the raw 'Px' column is kept,
# whereas the preceding evaluation drops it — confirm this is intended.
X_train = []
X_test = []
y_train = []
y_test = []
for state in states:
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=state)
    # Named `smote` rather than `os` so the standard-library module name is
    # not shadowed at module level.
    smote = SMOTE(random_state=42, n_jobs=-1)
    columns = Xtrain.columns
    os_data_X, os_data_y = smote.fit_resample(Xtrain, ytrain)
    # Re-wrap so the resampled features carry the training column labels.
    os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
    X_train.append(os_data_X)
    X_test.append(Xtest)
    y_train.append(os_data_y)
    y_test.append(ytest)
get_metrics(X_train, X_test, y_train, y_test)
Третья группа
# Third group: rows with 0 < Pz < 10 (both bounds exclusive).
threshold = [0, 10]
pz_window = (data['Pz'] > threshold[0]) & (data['Pz'] < threshold[1])
X = data.loc[pz_window].drop(columns=['type'])
y = data.loc[pz_window, 'type']
# Bare expression: count of class-1 rows, only visible in an interactive session.
y[y == 1].size

# Pairwise scatter panels on a 2 x 5 grid: class 1 first, class 0 with alpha=0.05.
plt.figure(figsize=(55, 21))
class_one = X[y == 1]
class_zero = X[y == 0]
for num, (x_name, y_name) in enumerate(combinations(['Px', 'Py', 'X', 'Y', 'Pz'], 2), 1):
    plt.subplot(2, 5, num)
    plt.scatter(class_one[x_name], class_one[y_name])
    plt.scatter(class_zero[x_name], class_zero[y_name], alpha=0.05)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
plt.show()

# One 70/30 split per seed in `states`.
# NOTE(review): 'Px' is excluded from the model inputs here although no
# Px-derived column has been added yet — confirm this is intended.
features_without_px = X.drop(columns=['Px'])
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(features_without_px, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
# New columns: sum of squares of Px and Py, and the absolute values of Px and Py.
px, py = X['Px'], X['Py']
X['centr_dist1'] = px ** 2 + py ** 2
X['axis_dist_Px'] = np.abs(px)
X['axis_dist_Py'] = np.abs(py)

# One 70/30 split per seed in `states`; raw 'Px' is left out of the inputs.
features_without_px = X.drop(columns=['Px'])
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(features_without_px, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
# Px vs X: class 1 first, class 0 with alpha=0.05, plus the straight line
# fitted through the class-0 points (drawn over Px in [-0.5, 0.5]).
class_one = X[y == 1]
class_zero = X[y == 0]
plt.figure(figsize=(10, 10))
plt.scatter(class_one['Px'], class_one['X'])
plt.scatter(class_zero['Px'], class_zero['X'], alpha=0.05)
# NOTE(review): fitted on all class-0 rows before the split below, so rows
# that land in the test sets also influence k1 and b1.
k1, b1 = np.polyfit(class_zero['Px'], class_zero['X'], deg=1)
x_line = np.linspace(-0.5, 0.5, 2)
plt.plot(x_line, k1 * x_line + b1, 'black')
plt.show()

# Perpendicular distance from each (Px, X) point to the line X = k1 * Px + b1.
offset_from_line = X['X'] - (k1 * X['Px'] + b1)
X['line_dist1'] = np.abs(offset_from_line / (k1 ** 2 + 1) ** 0.5)

# One 70/30 split per seed in `states`; raw 'Px' is left out of the inputs.
features_without_px = X.drop(columns=['Px'])
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(features_without_px, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
# Py vs Y: class 1 first, class 0 with alpha=0.05, plus the straight line
# fitted through the class-0 points (drawn over Py in [-0.5, 0.5]).
class_one = X[y == 1]
class_zero = X[y == 0]
plt.figure(figsize=(10, 10))
plt.scatter(class_one['Py'], class_one['Y'])
plt.scatter(class_zero['Py'], class_zero['Y'], alpha=0.05)
# NOTE(review): fitted on all class-0 rows before the split below, so rows
# that land in the test sets also influence k2 and b2.
k2, b2 = np.polyfit(class_zero['Py'], class_zero['Y'], deg=1)
x_line = np.linspace(-0.5, 0.5, 2)
plt.plot(x_line, k2 * x_line + b2, 'black')
plt.show()

# Perpendicular distance from each (Py, Y) point to the line Y = k2 * Py + b2.
offset_from_line = X['Y'] - (k2 * X['Py'] + b2)
X['line_dist2'] = np.abs(offset_from_line / (k2 ** 2 + 1) ** 0.5)

# One 70/30 split per seed in `states`; raw 'Px' is left out of the inputs.
features_without_px = X.drop(columns=['Px'])
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(features_without_px, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test)
# 0/1 indicator columns: inside the disc Px^2 + Py^2 < 0.01, and inside the
# bands |Px| < 0.1 and |Py| < 0.1.
inside_disc = X['Px'] ** 2 + X['Py'] ** 2 < 0.01
inside_px_band = np.abs(X['Px']) < 0.1
inside_py_band = np.abs(X['Py']) < 0.1
X['in_center1'] = np.where(inside_disc, 1, 0)
X['in_center2'] = np.where(inside_px_band, 1, 0)
X['in_center3'] = np.where(inside_py_band, 1, 0)

# One 70/30 split per seed in `states`; raw 'Px' is left out of the inputs and
# the three indicator columns are passed to get_metrics through `cat_ft`.
features_without_px = X.drop(columns=['Px'])
X_train, X_test, y_train, y_test = [], [], [], []
for state in states:
    parts = train_test_split(features_without_px, y, test_size=0.3, random_state=state)
    for bucket, part in zip((X_train, X_test, y_train, y_test), parts):
        bucket.append(part)
get_metrics(X_train, X_test, y_train, y_test, cat_ft=['in_center1', 'in_center2', 'in_center3'])
SMOTE
# SMOTE variant for the third group: each training split is resampled before
# evaluation; the test split is appended unchanged.
# NOTE(review): this run splits the full X, so the raw 'Px' column is kept,
# whereas the preceding evaluations drop it — confirm this is intended.
# NOTE(review): SMOTE presumably interpolates between neighbours, so the
# in_center* columns passed as `cat_ft` may no longer be strictly 0/1 in the
# resampled training data — confirm, or consider a categorical-aware sampler.
X_train = []
X_test = []
y_train = []
y_test = []
for state in states:
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=state)
    # Named `smote` rather than `os` so the standard-library module name is
    # not shadowed at module level.
    smote = SMOTE(random_state=42, n_jobs=-1)
    columns = Xtrain.columns
    os_data_X, os_data_y = smote.fit_resample(Xtrain, ytrain)
    # Re-wrap so the resampled features carry the training column labels.
    os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
    X_train.append(os_data_X)
    X_test.append(Xtest)
    y_train.append(os_data_y)
    y_test.append(ytest)
get_metrics(X_train, X_test, y_train, y_test, cat_ft=['in_center1', 'in_center2', 'in_center3'])
Соединение
# Model 1 of the combined pipeline: rows with 50 < Pz < 150, engineered
# features, SMOTE on every training split. y_test_1 keeps the per-seed test
# labels for the pooled ROC analysis.
X_train = []
X_test = []
y_train = []
y_test_1 = []

# Filtering and feature engineering do not depend on the split seed, so they
# are done once instead of once per iteration.
threshold = [50, 150]
pz_window = (data['Pz'] > threshold[0]) & (data['Pz'] < threshold[1])
X1 = data.drop(['type'], axis=1)[pz_window]
y1 = data[pz_window]['type']

# Sum-of-squares features; every coordinate pair gets its own column name so
# that no feature is overwritten by a later assignment.
X1['centr_dist1'] = X1['Px'] ** 2 + X1['Py'] ** 2
X1['centr_dist2'] = X1['Px'] ** 2 + X1['Y'] ** 2
X1['centr_dist3'] = X1['Py'] ** 2 + X1['X'] ** 2
X1['centr_dist4'] = X1['X'] ** 2 + X1['Y'] ** 2

# Perpendicular distances to the lines fitted through the class-0 points.
# NOTE(review): both fits use all class-0 rows, including rows that later
# land in the test splits.
k1, b1 = np.polyfit(X1[y1 == 0]['Px'], X1[y1 == 0]['X'], deg=1)
X1['line_dist1'] = np.abs((X1['X'] - (k1 * X1['Px'] + b1)) / (k1 ** 2 + 1) ** 0.5)
k2, b2 = np.polyfit(X1[y1 == 0]['Py'], X1[y1 == 0]['Y'], deg=1)
X1['line_dist2'] = np.abs((X1['Y'] - (k2 * X1['Py'] + b2)) / (k2 ** 2 + 1) ** 0.5)

for state in states:
    X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=state)
    # Named `smote` rather than `os` so the standard-library module name is
    # not shadowed at module level.
    smote = SMOTE(random_state=42, n_jobs=-1)
    columns = X1_train.columns
    os_data_X1, os_data_y1 = smote.fit_resample(X1_train, y1_train)
    # Re-wrap so the resampled features carry the training column labels.
    os_data_X1 = pd.DataFrame(data=os_data_X1, columns=columns)
    X_train.append(os_data_X1)
    X_test.append(X1_test)
    y_train.append(os_data_y1)
    y_test_1.append(y1_test)
model_1 = get_metrics(X_train, X_test, y_train, y_test_1, vis=False, proba=True)
# Model 2 of the combined pipeline: rows with 10 < Pz < 50, raw columns only
# (this block adds no engineered features and applies no resampling).
# y_test_2 keeps the per-seed test labels for the pooled ROC analysis.
X_train = []
X_test = []
y_train = []
y_test_2 = []

# The subset does not depend on the split seed, so it is built once instead
# of once per iteration.
threshold = [10, 50]
pz_window = (data['Pz'] > threshold[0]) & (data['Pz'] < threshold[1])
X2 = data.drop(['type'], axis=1)[pz_window]
y2 = data[pz_window]['type']

for state in states:
    X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=state)
    X_train.append(X2_train)
    X_test.append(X2_test)
    y_train.append(y2_train)
    y_test_2.append(y2_test)
model_2 = get_metrics(X_train, X_test, y_train, y_test_2, vis=False, proba=True)
# Model 3 of the combined pipeline: rows with 0 < Pz < 10 plus engineered
# features (no resampling in this block; the raw 'Px' column is kept).
# y_test_3 keeps the per-seed test labels for the pooled ROC analysis.
X_train = []
X_test = []
y_train = []
y_test_3 = []

# Filtering and feature engineering do not depend on the split seed, so they
# are done once instead of once per iteration.
threshold = [0, 10]
pz_window = (data['Pz'] > threshold[0]) & (data['Pz'] < threshold[1])
X3 = data.drop(['type'], axis=1)[pz_window]
y3 = data[pz_window]['type']

# Sum of squares of Px and Py, and the absolute values of Px and Py.
X3['centr_dist1'] = X3['Px'] ** 2 + X3['Py'] ** 2
X3['axis_dist_Px'] = np.abs(X3['Px'])
X3['axis_dist_Py'] = np.abs(X3['Py'])

# Perpendicular distances to the lines fitted through the class-0 points.
# NOTE(review): both fits use all class-0 rows, including rows that later
# land in the test splits.
k1, b1 = np.polyfit(X3[y3 == 0]['Px'], X3[y3 == 0]['X'], deg=1)
X3['line_dist1'] = np.abs((X3['X'] - (k1 * X3['Px'] + b1)) / (k1 ** 2 + 1) ** 0.5)
k2, b2 = np.polyfit(X3[y3 == 0]['Py'], X3[y3 == 0]['Y'], deg=1)
X3['line_dist2'] = np.abs((X3['Y'] - (k2 * X3['Py'] + b2)) / (k2 ** 2 + 1) ** 0.5)

# 0/1 indicators: inside the disc Px^2 + Py^2 < 0.01, and inside the bands
# |Px| < 0.1 and |Py| < 0.1. They are passed to get_metrics through `cat_ft`.
X3['in_center1'] = np.where(X3['Px'] ** 2 + X3['Py'] ** 2 < 0.01, 1, 0)
X3['in_center2'] = np.where(np.abs(X3['Px']) < 0.1, 1, 0)
X3['in_center3'] = np.where(np.abs(X3['Py']) < 0.1, 1, 0)

for state in states:
    X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.3, random_state=state)
    X_train.append(X3_train)
    X_test.append(X3_test)
    y_train.append(y3_train)
    y_test_3.append(y3_test)
model_3 = get_metrics(X_train, X_test, y_train, y_test_3, cat_ft=['in_center1', 'in_center2', 'in_center3'], vis=False, proba=True)
# Pooled ROC analysis: for every seed, the test labels and scores of the three
# Pz-range models are concatenated and one ROC curve per classifier is drawn.
# model_N[c][i] is indexed as (classifier c, seed i) with c = 0 logistic
# regression, 1 random forest, 2 CatBoost, 3 XGBoost.
# NOTE(review): this assumes get_metrics(..., proba=True) returns scores in
# the same row order as the matching y_test_N[i] — confirm against get_metrics.
log_reg_auc = []
rnd_frst_auc = []
cat_bst_auc = []
xg_bst_auc = []

# (panel title, classifier index in model_N, subplot slot, AUC accumulator)
classifiers = [
    ('Logistic Regression', 0, 1, log_reg_auc),
    ('Random Forest', 1, 2, rnd_frst_auc),
    ('CatBoost', 2, 3, cat_bst_auc),
    ('XGBoost', 3, 4, xg_bst_auc),
]

plt.figure(figsize=(25, 25))
plt.suptitle('ROC curves for different classifiers', y=0.92)
for i, state in enumerate(states):
    y_test = np.hstack((y_test_1[i], y_test_2[i], y_test_3[i]))
    for title, model_idx, slot, auc_scores in classifiers:
        proba = np.hstack((model_1[model_idx][i], model_2[model_idx][i], model_3[model_idx][i]))
        plt.subplot(2, 2, slot)
        fpr, tpr, _ = roc_curve(y_test, proba)
        # Curves are labelled on every panel; the legend itself is only drawn
        # on the Random Forest panel below.
        plt.plot(fpr, tpr, label='state {}'.format(state))
        auc_scores.append(roc_auc_score(y_test, proba))

box = {'facecolor': 'black', 'edgecolor': 'red', 'boxstyle': 'round'}
for title, model_idx, slot, auc_scores in classifiers:
    plt.subplot(2, 2, slot)
    plt.title(title)
    plt.plot([0, 1], [0, 1], linestyle='--', color='black')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Mean and spread both come from this classifier's own AUC list, so a
    # panel never reports another model's standard deviation.
    # NOTE(review): '{:4f}' sets a minimum width of 4, not a precision;
    # '{:.4f}' may have been intended.
    plt.text(0.8, 0.05, 'AUC = {:4f} ± {:4f}'.format(np.mean(auc_scores), np.std(auc_scores)), horizontalalignment='center', bbox=box, color='white')

# One legend for the per-seed curves, placed on the Random Forest panel.
plt.subplot(2, 2, 2)
plt.legend(loc='upper right')
plt.show()